
library(dplyr)
library(readstata13)


## Move to the replication directory used on the original author's machine.
## If that path is not available (e.g. on another computer), keep the current
## working directory so the script can still run from wherever the input
## files have been placed.
tryCatch(
  setwd("C:/Users/kevin/Dropbox/Social_Media_Lab/Data_Survey/YouGov_Data/apsr_replication/"),
  error = function(e) {
    message("Replication directory not found; reading inputs from ", getwd())
  }
)

## Survey file in Stata format.
nr <- read.dta13("nr_controls.dta")

## Restore the saved objects from the tweet file. The rest of the script
## works on a data frame called `data`, which is not created anywhere else in
## this script, so fail early with a clear error if the file did not
## provide it.
load("combined_tweets.RData")
stopifnot(is.data.frame(data))





###Make sure the coding is in the correct direction for the following variables:
#twitter_post_political_w1 facebook_post_political_w1 
#         twitter_post_political_w2 facebook_post_political_w2
#         twitter_post_political_w4
#         facebook_post_political_w4
#         twitter_see_pol_info_w2
#         facebook_see_pol_info_w2   pol_info_freq_twitter_w4 pol_info_freq_fb_w4
#         twitter_freq_w4 facebook_freq_w4
#      recode internet_freq_w1  internet_freq_w4 




## Build the cross-wave usage indices, identically for both samples.
##
## df: a data frame holding the wave-specific items referenced below
##     (*_w1 / *_w2 / *_w4).
## Returns df with the unsuffixed usage columns removed and five *_inc columns
## added. Each *_inc column is the row-wise mean over the listed waves,
## skipping missing waves; a row that is missing in every listed wave gets NaN.
##
## NOTE(review): facebook_post_political is dropped here but no
## facebook_post_political_inc index is built -- confirm this is intentional.
add_usage_indices <- function(df) {
  ## remove the unsuffixed versions so that only the wave-specific items and
  ## the indices derived from them remain (a no-op for absent columns)
  for (stale in c("twitter_freq", "facebook_freq", "internet_freq",
                  "twitter_see_pol_info", "twitter_post_political",
                  "facebook_post_political")) {
    df[[stale]] <- NULL
  }

  wave_mean <- function(...) rowMeans(cbind(...), na.rm = TRUE)

  df$twitter_freq_inc <- wave_mean(df$twitter_freq_w1, df$twitter_freq_w4)
  df$facebook_freq_inc <- wave_mean(df$facebook_freq_w1, df$facebook_freq_w4)
  df$internet_freq_inc <- wave_mean(df$internet_freq_w1, df$internet_freq_w4)
  df$twitter_see_pol_info_inc <- wave_mean(df$twitter_see_pol_info_w4,
                                           df$twitter_see_pol_info_w2)
  df$twitter_post_political_inc <- wave_mean(df$twitter_post_political_w1,
                                             df$twitter_post_political_w2,
                                             df$twitter_post_political_w4)
  df
}

data <- add_usage_indices(data)
nr <- add_usage_indices(nr)




## Control pool: respondents in `nr` who do not use Twitter. This adds a large
## number of "control" observations. A respondent qualifies either by
## reporting no Twitter use (twitter_user == 0) or by having a cross-wave
## frequency index below 2 (presumably "never" after earlier reporting use --
## confirm against the codebook).
## NOTE(review): filter() drops rows for which the condition evaluates to NA.
nr_no_twitter <- filter(nr, twitter_user == 0 | twitter_freq_inc < 2)

## Check on the panel match: age in wave 2 should track age in wave 3 (the
## small offset is someone having had a birthday). Printed explicitly so the
## result is also shown when the script is run through source().
print(summary(lm(age_w2 ~ age_w3, data = nr_no_twitter)))

## Respondents in `nr` who DO use Twitter, kept for comparison with the SOMA
## sample. The cut-off is >= 2 so that this group never overlaps the
## non-user group above (index values between 1 and 2 occur because the index
## is a mean over waves) and matches the split applied to the merged data.
nr_yes_twitter <- filter(nr, twitter_user == 1 & twitter_freq_inc >= 2)


##############################################################
##calculate demographic info for this group
##############################################################

############################################################## make sure same variables for match
## Collapse the wave-specific demographics into one column per respondent so
## that both data frames carry the same variable names before they are
## merged. Each value is the row-wise mean over the listed waves, skipping
## missing waves.

## gender: waves 1-4, both data frames
nr_no_twitter$gender <- rowMeans(
  cbind(nr_no_twitter$gender_w1, nr_no_twitter$gender_w2,
        nr_no_twitter$gender_w3, nr_no_twitter$gender_w4),
  na.rm = TRUE
)

data$gender <- rowMeans(
  cbind(data$gender_w1, data$gender_w2,
        data$gender_w3, data$gender_w4),
  na.rm = TRUE
)

## age at which education ended: waves 2-4
## NOTE(review): built for nr_no_twitter only; presumably `data` already has
## a profile_education_age column -- confirm.
nr_no_twitter$profile_education_age <- rowMeans(
  cbind(nr_no_twitter$profile_education_age_w2,
        nr_no_twitter$profile_education_age_w3,
        nr_no_twitter$profile_education_age_w4),
  na.rm = TRUE
)

## income (profile_income_old_* items): waves 2-4, both data frames
nr_no_twitter$profile_income <- rowMeans(
  cbind(nr_no_twitter$profile_income_old_w2,
        nr_no_twitter$profile_income_old_w3,
        nr_no_twitter$profile_income_old_w4),
  na.rm = TRUE
)

data$profile_income <- rowMeans(
  cbind(data$profile_income_old_w2,
        data$profile_income_old_w3,
        data$profile_income_old_w4),
  na.rm = TRUE
)



###### merge
## Combine the non-Twitter controls with the tweet data. With no `by`
## argument, merge() joins on every column name the two data frames share;
## all = TRUE keeps the rows of both inputs that find no match, and columns
## present in only one input are filled with NA for the other's rows.
full_data <- merge(nr_no_twitter, data, all = TRUE)



##### party identification
## Use the wave-2 answer and fall back to the wave-3 answer for respondents
## whose wave-2 answer is missing.
w2_missing <- is.na(full_data$profile_partyid_w2)
full_data$pid <- replace(
  full_data$profile_partyid_w2,
  w2_missing,
  full_data$profile_partyid_w3[w2_missing]
)
rm(w2_missing)


### indicator variables for the three main parties
## Each indicator starts from the numeric party code and sets every code
## other than the party's own to 0; missing codes stay missing.
## NOTE(review): the party's own code is kept as is, so lab_pid takes the
## values 0/2 and libdem_pid 0/3 rather than 0/1, and cons_pid keeps any
## code <= 1 unchanged -- confirm this is what the models expect.
pid_code <- as.numeric(full_data$pid)

full_data$cons_pid <- replace(pid_code, which(pid_code > 1), 0)
full_data$lab_pid <- replace(pid_code, which(pid_code != 2), 0)
full_data$libdem_pid <- replace(pid_code, which(pid_code != 3), 0)

rm(pid_code)

## Split the merged data by Twitter use. The tweet columns of the non-user
## half are replaced with zeros further down so those rows can serve as
## controls.
## Note that this also overwrites the tweet columns of some people for whom
## we should have some tweet data.
data_no_twitter <- filter(full_data, twitter_user == 0 | twitter_freq_inc < 2)

data_yes_twitter <- filter(full_data, twitter_user == 1 & twitter_freq_inc >= 2)

## The two conditions are mutually exclusive but not exhaustive: filter()
## drops rows whose condition is NA (missing twitter_user / twitter_freq_inc)
## and rows with any other twitter_user value. Report how many rows are lost
## this way instead of discarding them silently.
n_unassigned <- nrow(full_data) - nrow(data_no_twitter) - nrow(data_yes_twitter)
if (n_unassigned > 0) {
  message(n_unassigned, " row(s) of full_data fall in neither Twitter group ",
          "(missing or unexpected twitter_user / twitter_freq_inc) ",
          "and are dropped")
}
rm(n_unassigned)



## Split the columns of the non-user rows into three groups, so that only the
## tweet measures are overwritten:
##   real_tweets    - columns starting with "p" whose name matches "p._",
##                    plus tweets_all_topics
##   no_real_tweets - the remaining columns starting with "p"
##   no_tweets      - every other column
## NOTE(review): "p._" is an unanchored regular expression, so it matches
## "p", any character, "_" anywhere in a column name; starts_with() and
## matches() also ignore case by default. Confirm that only the intended
## tweet columns are selected.

## first cut: columns starting with "p" vs. everything else;
## tweets_all_topics travels with the "p" columns
no_tweets <- dplyr::select(data_no_twitter, -starts_with("p"))
no_tweets[["tweets_all_topics"]] <- NULL

tweets <- dplyr::select(data_no_twitter, starts_with("p"))
tweets[["tweets_all_topics"]] <- data_no_twitter$tweets_all_topics

## second cut: within the "p" columns, separate the tweet measures
no_real_tweets <- dplyr::select(tweets, -matches("p._"))
no_real_tweets[["tweets_all_topics"]] <- NULL

real_tweets <- dplyr::select(tweets, matches("p._"))
real_tweets[["tweets_all_topics"]] <- tweets$tweets_all_topics

## overwrite every tweet measure with zero: first the missing cells, then
## all the cells that are not missing
real_tweets[is.na(real_tweets)] <- 0
real_tweets[!is.na(real_tweets)] <- 0

## put the zeroed tweet measures back next to the untouched columns
combined_tweets <- cbind(real_tweets, no_real_tweets)
combined_data_no_twitter <- cbind(combined_tweets, no_tweets)


## Stack the zeroed non-user rows with the Twitter-user rows. With no `by`
## argument, merge() joins on every column name the two data frames share;
## all = TRUE keeps the unmatched rows of both inputs.
data0 <- merge(combined_data_no_twitter, data_yes_twitter, all = TRUE)

## Respondents who report not using a platform get the value 1 on the
## corresponding usage indices (presumably the lowest, "never", category --
## confirm against the codebook). Rows with a missing *_user value are left
## unchanged.
data0$twitter_freq_inc[data0$twitter_user == 0] <- 1
data0$facebook_freq_inc[data0$facebook_user == 0] <- 1
data0$twitter_see_pol_info_inc[data0$twitter_user == 0] <- 1
data0$twitter_post_political_inc[data0$twitter_user == 0] <- 1




## Keep only the final data frame, under the name `data`, and remove every
## other object created by this script.
data <- data0
rm(list = setdiff(ls(), "data"))

## Write `data` alone. save.image() would store every object of the global
## environment, hidden ones such as .Random.seed included, so loading the
## file later could silently replace the random-number state of that session.
save(data, file = "merged_nr_soma.RData")
